In [1]:
sc
Out[1]:
<pyspark.context.SparkContext at 0x7fd994110650>
In [2]:
sqlContext
Out[2]:
<pyspark.sql.context.HiveContext at 0x7fd9790af890>
The tweets.json file has 243,554 tweets and takes 1.2 GiB of space. It was collected from Twitter's filter stream by searching for "trump,donaldtrump,clinton,hillaryclinton,sanders,berniesanders", and it was collected for two hours or so around noon MDT. Let's get a dictionary of languages as keys and number of tweets in those languages are values. We'll see how long this takes in old-fashioned Python.
In [3]:
import time
import simplejson as json
filename = '/home/anaconda/md0/data/2016_potus/stream/tweets.json'
langs = {}
start_time = time.time()
f_p = open(filename,'r')
for line in f_p:
tweet = json.loads(line)
if 'lang' in tweet:
if tweet['lang'] in langs:
langs[tweet['lang']] += 1
else:
langs[tweet['lang']] = 1
elapsed_time = time.time() - start_time
print "%02f seconds" % elapsed_time
# Pretty print langs as JSON
print "%s" % json.dumps(langs, indent=4)
21.767186 seconds
{
"el": 13,
"en": 226229,
"zh": 5,
"vi": 14,
"is": 3,
"it": 258,
"iw": 5,
"eu": 1,
"cy": 23,
"ar": 48,
"in": 181,
"cs": 25,
"et": 48,
"es": 4200,
"ru": 40,
"nl": 223,
"pt": 447,
"no": 28,
"lo": 1,
"tr": 255,
"lt": 8,
"lv": 4,
"tl": 223,
"th": 2,
"ro": 112,
"pl": 630,
"ta": 1,
"fr": 1409,
"de": 340,
"ht": 84,
"da": 129,
"fa": 3,
"hi": 21,
"fi": 9,
"hu": 20,
"ja": 102,
"sr": 1,
"ko": 5,
"sv": 89,
"und": 8301,
"sl": 5
}
In [5]:
import time
filename = '/home/anaconda/md0/data/2016_potus/stream/tweets.json'
start_time = time.time()
# Form a Spark dataframe and register a temp table
sdf = sqlContext.read.json(filename)
sdf.registerTempTable('tweets')
query = "select lang, count(*) as num from tweets group by lang order by num desc"
pdf = sqlContext.sql(query).toPandas()
elapsed_time = time.time() - start_time
print "%02f seconds" % elapsed_time
pdf
9.048916 seconds
Out[5]:
lang
num
0
en
226229
1
und
8301
2
es
4200
3
fr
1409
4
pl
630
5
pt
447
6
de
340
7
it
258
8
tr
255
9
tl
223
10
nl
223
11
in
181
12
da
129
13
ro
112
14
ja
102
15
sv
89
16
ht
84
17
ar
48
18
et
48
19
ru
40
20
no
28
21
cs
25
22
cy
23
23
hi
21
24
hu
20
25
vi
14
26
el
13
27
None
9
28
fi
9
29
lt
8
30
sl
5
31
zh
5
32
iw
5
33
ko
5
34
lv
4
35
is
3
36
fa
3
37
th
2
38
sr
1
39
ta
1
40
eu
1
41
lo
1
In [6]:
sdf.printSchema()
root
|-- contributors: string (nullable = true)
|-- coordinates: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- created_at: string (nullable = true)
|-- entities: struct (nullable = true)
| |-- hashtags: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- text: string (nullable = true)
| |-- media: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- media_url: string (nullable = true)
| | | |-- media_url_https: string (nullable = true)
| | | |-- sizes: struct (nullable = true)
| | | | |-- large: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- medium: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- small: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- thumb: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | |-- source_status_id: long (nullable = true)
| | | |-- source_status_id_str: string (nullable = true)
| | | |-- source_user_id: long (nullable = true)
| | | |-- source_user_id_str: string (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- url: string (nullable = true)
| |-- symbols: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- text: string (nullable = true)
| |-- urls: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- url: string (nullable = true)
| |-- user_mentions: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- name: string (nullable = true)
| | | |-- screen_name: string (nullable = true)
|-- extended_entities: struct (nullable = true)
| |-- media: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- media_url: string (nullable = true)
| | | |-- media_url_https: string (nullable = true)
| | | |-- sizes: struct (nullable = true)
| | | | |-- large: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- medium: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- small: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- thumb: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | |-- source_status_id: long (nullable = true)
| | | |-- source_status_id_str: string (nullable = true)
| | | |-- source_user_id: long (nullable = true)
| | | |-- source_user_id_str: string (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- url: string (nullable = true)
| | | |-- video_info: struct (nullable = true)
| | | | |-- aspect_ratio: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- duration_millis: long (nullable = true)
| | | | |-- variants: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- bitrate: long (nullable = true)
| | | | | | |-- content_type: string (nullable = true)
| | | | | | |-- url: string (nullable = true)
|-- favorite_count: long (nullable = true)
|-- favorited: boolean (nullable = true)
|-- filter_level: string (nullable = true)
|-- geo: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- id: long (nullable = true)
|-- id_str: string (nullable = true)
|-- in_reply_to_screen_name: string (nullable = true)
|-- in_reply_to_status_id: long (nullable = true)
|-- in_reply_to_status_id_str: string (nullable = true)
|-- in_reply_to_user_id: long (nullable = true)
|-- in_reply_to_user_id_str: string (nullable = true)
|-- is_quote_status: boolean (nullable = true)
|-- lang: string (nullable = true)
|-- place: struct (nullable = true)
| |-- bounding_box: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: array (containsNull = true)
| | | | |-- element: array (containsNull = true)
| | | | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- country: string (nullable = true)
| |-- country_code: string (nullable = true)
| |-- full_name: string (nullable = true)
| |-- id: string (nullable = true)
| |-- name: string (nullable = true)
| |-- place_type: string (nullable = true)
| |-- url: string (nullable = true)
|-- possibly_sensitive: boolean (nullable = true)
|-- quoted_status: struct (nullable = true)
| |-- contributors: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- coordinates: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- created_at: string (nullable = true)
| |-- entities: struct (nullable = true)
| | |-- hashtags: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- text: string (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- source_user_id: long (nullable = true)
| | | | |-- source_user_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | |-- symbols: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- text: string (nullable = true)
| | |-- urls: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- url: string (nullable = true)
| | |-- user_mentions: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- name: string (nullable = true)
| | | | |-- screen_name: string (nullable = true)
| |-- extended_entities: struct (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- source_user_id: long (nullable = true)
| | | | |-- source_user_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | | | |-- video_info: struct (nullable = true)
| | | | | |-- aspect_ratio: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- duration_millis: long (nullable = true)
| | | | | |-- variants: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- bitrate: long (nullable = true)
| | | | | | | |-- content_type: string (nullable = true)
| | | | | | | |-- url: string (nullable = true)
| |-- favorite_count: long (nullable = true)
| |-- favorited: boolean (nullable = true)
| |-- filter_level: string (nullable = true)
| |-- geo: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- id: long (nullable = true)
| |-- id_str: string (nullable = true)
| |-- in_reply_to_screen_name: string (nullable = true)
| |-- in_reply_to_status_id: long (nullable = true)
| |-- in_reply_to_status_id_str: string (nullable = true)
| |-- in_reply_to_user_id: long (nullable = true)
| |-- in_reply_to_user_id_str: string (nullable = true)
| |-- is_quote_status: boolean (nullable = true)
| |-- lang: string (nullable = true)
| |-- place: string (nullable = true)
| |-- possibly_sensitive: boolean (nullable = true)
| |-- quoted_status_id: long (nullable = true)
| |-- quoted_status_id_str: string (nullable = true)
| |-- retweet_count: long (nullable = true)
| |-- retweeted: boolean (nullable = true)
| |-- scopes: struct (nullable = true)
| | |-- followers: boolean (nullable = true)
| |-- source: string (nullable = true)
| |-- text: string (nullable = true)
| |-- truncated: boolean (nullable = true)
| |-- user: struct (nullable = true)
| | |-- contributors_enabled: boolean (nullable = true)
| | |-- created_at: string (nullable = true)
| | |-- default_profile: boolean (nullable = true)
| | |-- default_profile_image: boolean (nullable = true)
| | |-- description: string (nullable = true)
| | |-- favourites_count: long (nullable = true)
| | |-- follow_request_sent: string (nullable = true)
| | |-- followers_count: long (nullable = true)
| | |-- following: string (nullable = true)
| | |-- friends_count: long (nullable = true)
| | |-- geo_enabled: boolean (nullable = true)
| | |-- id: long (nullable = true)
| | |-- id_str: string (nullable = true)
| | |-- is_translator: boolean (nullable = true)
| | |-- lang: string (nullable = true)
| | |-- listed_count: long (nullable = true)
| | |-- location: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- notifications: string (nullable = true)
| | |-- profile_background_color: string (nullable = true)
| | |-- profile_background_image_url: string (nullable = true)
| | |-- profile_background_image_url_https: string (nullable = true)
| | |-- profile_background_tile: boolean (nullable = true)
| | |-- profile_banner_url: string (nullable = true)
| | |-- profile_image_url: string (nullable = true)
| | |-- profile_image_url_https: string (nullable = true)
| | |-- profile_link_color: string (nullable = true)
| | |-- profile_sidebar_border_color: string (nullable = true)
| | |-- profile_sidebar_fill_color: string (nullable = true)
| | |-- profile_text_color: string (nullable = true)
| | |-- profile_use_background_image: boolean (nullable = true)
| | |-- protected: boolean (nullable = true)
| | |-- screen_name: string (nullable = true)
| | |-- statuses_count: long (nullable = true)
| | |-- time_zone: string (nullable = true)
| | |-- url: string (nullable = true)
| | |-- utc_offset: long (nullable = true)
| | |-- verified: boolean (nullable = true)
|-- quoted_status_id: long (nullable = true)
|-- quoted_status_id_str: string (nullable = true)
|-- retweet_count: long (nullable = true)
|-- retweeted: boolean (nullable = true)
|-- retweeted_status: struct (nullable = true)
| |-- contributors: string (nullable = true)
| |-- coordinates: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- created_at: string (nullable = true)
| |-- entities: struct (nullable = true)
| | |-- hashtags: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- text: string (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- source_user_id: long (nullable = true)
| | | | |-- source_user_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | |-- symbols: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- text: string (nullable = true)
| | |-- urls: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- url: string (nullable = true)
| | |-- user_mentions: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- name: string (nullable = true)
| | | | |-- screen_name: string (nullable = true)
| |-- extended_entities: struct (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- source_user_id: long (nullable = true)
| | | | |-- source_user_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | | | |-- video_info: struct (nullable = true)
| | | | | |-- aspect_ratio: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- duration_millis: long (nullable = true)
| | | | | |-- variants: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- bitrate: long (nullable = true)
| | | | | | | |-- content_type: string (nullable = true)
| | | | | | | |-- url: string (nullable = true)
| |-- favorite_count: long (nullable = true)
| |-- favorited: boolean (nullable = true)
| |-- filter_level: string (nullable = true)
| |-- geo: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- id: long (nullable = true)
| |-- id_str: string (nullable = true)
| |-- in_reply_to_screen_name: string (nullable = true)
| |-- in_reply_to_status_id: long (nullable = true)
| |-- in_reply_to_status_id_str: string (nullable = true)
| |-- in_reply_to_user_id: long (nullable = true)
| |-- in_reply_to_user_id_str: string (nullable = true)
| |-- is_quote_status: boolean (nullable = true)
| |-- lang: string (nullable = true)
| |-- place: struct (nullable = true)
| | |-- bounding_box: struct (nullable = true)
| | | |-- coordinates: array (nullable = true)
| | | | |-- element: array (containsNull = true)
| | | | | |-- element: array (containsNull = true)
| | | | | | |-- element: double (containsNull = true)
| | | |-- type: string (nullable = true)
| | |-- country: string (nullable = true)
| | |-- country_code: string (nullable = true)
| | |-- full_name: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- place_type: string (nullable = true)
| | |-- url: string (nullable = true)
| |-- possibly_sensitive: boolean (nullable = true)
| |-- quoted_status: struct (nullable = true)
| | |-- contributors: array (nullable = true)
| | | |-- element: long (containsNull = true)
| | |-- coordinates: struct (nullable = true)
| | | |-- coordinates: array (nullable = true)
| | | | |-- element: double (containsNull = true)
| | | |-- type: string (nullable = true)
| | |-- created_at: string (nullable = true)
| | |-- entities: struct (nullable = true)
| | | |-- hashtags: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- text: string (nullable = true)
| | | |-- media: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- display_url: string (nullable = true)
| | | | | |-- expanded_url: string (nullable = true)
| | | | | |-- id: long (nullable = true)
| | | | | |-- id_str: string (nullable = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- media_url: string (nullable = true)
| | | | | |-- media_url_https: string (nullable = true)
| | | | | |-- sizes: struct (nullable = true)
| | | | | | |-- large: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- medium: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- small: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- thumb: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | |-- source_status_id: long (nullable = true)
| | | | | |-- source_status_id_str: string (nullable = true)
| | | | | |-- source_user_id: long (nullable = true)
| | | | | |-- source_user_id_str: string (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- url: string (nullable = true)
| | | |-- symbols: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- text: string (nullable = true)
| | | |-- urls: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- display_url: string (nullable = true)
| | | | | |-- expanded_url: string (nullable = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- url: string (nullable = true)
| | | |-- user_mentions: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- id: long (nullable = true)
| | | | | |-- id_str: string (nullable = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- name: string (nullable = true)
| | | | | |-- screen_name: string (nullable = true)
| | |-- extended_entities: struct (nullable = true)
| | | |-- media: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- display_url: string (nullable = true)
| | | | | |-- expanded_url: string (nullable = true)
| | | | | |-- id: long (nullable = true)
| | | | | |-- id_str: string (nullable = true)
| | | | | |-- indices: array (nullable = true)
| | | | | | |-- element: long (containsNull = true)
| | | | | |-- media_url: string (nullable = true)
| | | | | |-- media_url_https: string (nullable = true)
| | | | | |-- sizes: struct (nullable = true)
| | | | | | |-- large: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- medium: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- small: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | | |-- thumb: struct (nullable = true)
| | | | | | | |-- h: long (nullable = true)
| | | | | | | |-- resize: string (nullable = true)
| | | | | | | |-- w: long (nullable = true)
| | | | | |-- source_status_id: long (nullable = true)
| | | | | |-- source_status_id_str: string (nullable = true)
| | | | | |-- source_user_id: long (nullable = true)
| | | | | |-- source_user_id_str: string (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- url: string (nullable = true)
| | | | | |-- video_info: struct (nullable = true)
| | | | | | |-- aspect_ratio: array (nullable = true)
| | | | | | | |-- element: long (containsNull = true)
| | | | | | |-- duration_millis: long (nullable = true)
| | | | | | |-- variants: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- bitrate: long (nullable = true)
| | | | | | | | |-- content_type: string (nullable = true)
| | | | | | | | |-- url: string (nullable = true)
| | |-- favorite_count: long (nullable = true)
| | |-- favorited: boolean (nullable = true)
| | |-- filter_level: string (nullable = true)
| | |-- geo: struct (nullable = true)
| | | |-- coordinates: array (nullable = true)
| | | | |-- element: double (containsNull = true)
| | | |-- type: string (nullable = true)
| | |-- id: long (nullable = true)
| | |-- id_str: string (nullable = true)
| | |-- in_reply_to_screen_name: string (nullable = true)
| | |-- in_reply_to_status_id: long (nullable = true)
| | |-- in_reply_to_status_id_str: string (nullable = true)
| | |-- in_reply_to_user_id: long (nullable = true)
| | |-- in_reply_to_user_id_str: string (nullable = true)
| | |-- is_quote_status: boolean (nullable = true)
| | |-- lang: string (nullable = true)
| | |-- place: string (nullable = true)
| | |-- possibly_sensitive: boolean (nullable = true)
| | |-- quoted_status_id: long (nullable = true)
| | |-- quoted_status_id_str: string (nullable = true)
| | |-- retweet_count: long (nullable = true)
| | |-- retweeted: boolean (nullable = true)
| | |-- scopes: struct (nullable = true)
| | | |-- followers: boolean (nullable = true)
| | |-- source: string (nullable = true)
| | |-- text: string (nullable = true)
| | |-- truncated: boolean (nullable = true)
| | |-- user: struct (nullable = true)
| | | |-- contributors_enabled: boolean (nullable = true)
| | | |-- created_at: string (nullable = true)
| | | |-- default_profile: boolean (nullable = true)
| | | |-- default_profile_image: boolean (nullable = true)
| | | |-- description: string (nullable = true)
| | | |-- favourites_count: long (nullable = true)
| | | |-- follow_request_sent: string (nullable = true)
| | | |-- followers_count: long (nullable = true)
| | | |-- following: string (nullable = true)
| | | |-- friends_count: long (nullable = true)
| | | |-- geo_enabled: boolean (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- is_translator: boolean (nullable = true)
| | | |-- lang: string (nullable = true)
| | | |-- listed_count: long (nullable = true)
| | | |-- location: string (nullable = true)
| | | |-- name: string (nullable = true)
| | | |-- notifications: string (nullable = true)
| | | |-- profile_background_color: string (nullable = true)
| | | |-- profile_background_image_url: string (nullable = true)
| | | |-- profile_background_image_url_https: string (nullable = true)
| | | |-- profile_background_tile: boolean (nullable = true)
| | | |-- profile_banner_url: string (nullable = true)
| | | |-- profile_image_url: string (nullable = true)
| | | |-- profile_image_url_https: string (nullable = true)
| | | |-- profile_link_color: string (nullable = true)
| | | |-- profile_sidebar_border_color: string (nullable = true)
| | | |-- profile_sidebar_fill_color: string (nullable = true)
| | | |-- profile_text_color: string (nullable = true)
| | | |-- profile_use_background_image: boolean (nullable = true)
| | | |-- protected: boolean (nullable = true)
| | | |-- screen_name: string (nullable = true)
| | | |-- statuses_count: long (nullable = true)
| | | |-- time_zone: string (nullable = true)
| | | |-- url: string (nullable = true)
| | | |-- utc_offset: long (nullable = true)
| | | |-- verified: boolean (nullable = true)
| |-- quoted_status_id: long (nullable = true)
| |-- quoted_status_id_str: string (nullable = true)
| |-- retweet_count: long (nullable = true)
| |-- retweeted: boolean (nullable = true)
| |-- scopes: struct (nullable = true)
| | |-- followers: boolean (nullable = true)
| |-- source: string (nullable = true)
| |-- text: string (nullable = true)
| |-- truncated: boolean (nullable = true)
| |-- user: struct (nullable = true)
| | |-- contributors_enabled: boolean (nullable = true)
| | |-- created_at: string (nullable = true)
| | |-- default_profile: boolean (nullable = true)
| | |-- default_profile_image: boolean (nullable = true)
| | |-- description: string (nullable = true)
| | |-- favourites_count: long (nullable = true)
| | |-- follow_request_sent: string (nullable = true)
| | |-- followers_count: long (nullable = true)
| | |-- following: string (nullable = true)
| | |-- friends_count: long (nullable = true)
| | |-- geo_enabled: boolean (nullable = true)
| | |-- id: long (nullable = true)
| | |-- id_str: string (nullable = true)
| | |-- is_translator: boolean (nullable = true)
| | |-- lang: string (nullable = true)
| | |-- listed_count: long (nullable = true)
| | |-- location: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- notifications: string (nullable = true)
| | |-- profile_background_color: string (nullable = true)
| | |-- profile_background_image_url: string (nullable = true)
| | |-- profile_background_image_url_https: string (nullable = true)
| | |-- profile_background_tile: boolean (nullable = true)
| | |-- profile_banner_url: string (nullable = true)
| | |-- profile_image_url: string (nullable = true)
| | |-- profile_image_url_https: string (nullable = true)
| | |-- profile_link_color: string (nullable = true)
| | |-- profile_sidebar_border_color: string (nullable = true)
| | |-- profile_sidebar_fill_color: string (nullable = true)
| | |-- profile_text_color: string (nullable = true)
| | |-- profile_use_background_image: boolean (nullable = true)
| | |-- protected: boolean (nullable = true)
| | |-- screen_name: string (nullable = true)
| | |-- statuses_count: long (nullable = true)
| | |-- time_zone: string (nullable = true)
| | |-- url: string (nullable = true)
| | |-- utc_offset: long (nullable = true)
| | |-- verified: boolean (nullable = true)
|-- scopes: struct (nullable = true)
| |-- place_ids: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- source: string (nullable = true)
|-- text: string (nullable = true)
|-- timestamp_ms: string (nullable = true)
|-- truncated: boolean (nullable = true)
|-- user: struct (nullable = true)
| |-- contributors_enabled: boolean (nullable = true)
| |-- created_at: string (nullable = true)
| |-- default_profile: boolean (nullable = true)
| |-- default_profile_image: boolean (nullable = true)
| |-- description: string (nullable = true)
| |-- favourites_count: long (nullable = true)
| |-- follow_request_sent: string (nullable = true)
| |-- followers_count: long (nullable = true)
| |-- following: string (nullable = true)
| |-- friends_count: long (nullable = true)
| |-- geo_enabled: boolean (nullable = true)
| |-- id: long (nullable = true)
| |-- id_str: string (nullable = true)
| |-- is_translator: boolean (nullable = true)
| |-- lang: string (nullable = true)
| |-- listed_count: long (nullable = true)
| |-- location: string (nullable = true)
| |-- name: string (nullable = true)
| |-- notifications: string (nullable = true)
| |-- profile_background_color: string (nullable = true)
| |-- profile_background_image_url: string (nullable = true)
| |-- profile_background_image_url_https: string (nullable = true)
| |-- profile_background_tile: boolean (nullable = true)
| |-- profile_banner_url: string (nullable = true)
| |-- profile_image_url: string (nullable = true)
| |-- profile_image_url_https: string (nullable = true)
| |-- profile_link_color: string (nullable = true)
| |-- profile_sidebar_border_color: string (nullable = true)
| |-- profile_sidebar_fill_color: string (nullable = true)
| |-- profile_text_color: string (nullable = true)
| |-- profile_use_background_image: boolean (nullable = true)
| |-- protected: boolean (nullable = true)
| |-- screen_name: string (nullable = true)
| |-- statuses_count: long (nullable = true)
| |-- time_zone: string (nullable = true)
| |-- url: string (nullable = true)
| |-- utc_offset: long (nullable = true)
| |-- verified: boolean (nullable = true)
In [43]:
start_time = time.time()
query = """
select
sq2.time_zone as time_zone,
sq2.mentions as mentions,
sq2.clinton_rank as clinton_rank,
sq2.trump_rank as trump_rank,
sq2.sanders_rank as sanders_rank
from (
select
sq.time_zone as time_zone,
sq.clinton + sq.trump + sq.sanders as mentions,
dense_rank() over (order by sq.clinton desc) as clinton_rank,
dense_rank() over (order by sq.trump desc) as trump_rank,
dense_rank() over (order by sq.sanders desc) as sanders_rank
from (
select
user.time_zone as time_zone,
sum(case
when lower(text) like '%clinton%' or lower(text) like '%hillary%'
then 1
else 0
end) as clinton,
sum(case
when lower(text) like '%trump%' or lower(text) like '%donald%'
then 1
else 0
end) as trump,
sum(case
when lower(text) like '%sanders%' or lower(text) like '%bernie%'
then 1
else 0
end) as sanders
from tweets
group by user.time_zone
) sq
) sq2
where
sq2.clinton_rank < 30
or sq2.trump_rank < 30
or sq2.sanders_rank < 30
order by sq2.mentions desc
"""
pdf = sqlContext.sql(query).toPandas()
elapsed_time = time.time() - start_time
print "%02f seconds" % elapsed_time
pdf
3.718285 seconds
Out[43]:
time_zone
mentions
clinton_rank
trump_rank
sanders_rank
0
None
112310
1
1
1
1
Eastern Time (US & Canada)
39572
2
2
3
2
Pacific Time (US & Canada)
39096
3
3
2
3
Central Time (US & Canada)
21007
4
4
4
4
Atlantic Time (Canada)
5842
5
5
5
5
Quito
4924
6
6
7
6
Mountain Time (US & Canada)
4452
7
7
6
7
Arizona
3472
8
8
8
8
London
2968
9
9
9
9
Hawaii
1532
11
11
10
10
Amsterdam
1441
10
10
15
11
Alaska
1153
12
12
12
12
America/New_York
1096
13
13
11
13
Athens
906
15
15
16
14
Brasilia
819
18
14
17
15
America/Los_Angeles
742
17
20
13
16
Paris
706
19
16
18
17
America/Chicago
703
14
21
14
18
Caracas
663
16
17
21
19
International Date Line West
513
20
18
28
20
Casablanca
485
23
23
19
21
New Delhi
471
27
19
30
22
Madrid
469
28
22
20
23
Tehran
460
21
29
16
24
Mexico City
451
24
23
23
25
Greenland
395
22
26
24
26
Indiana (East)
353
25
33
18
27
Chennai
317
33
24
39
28
Rome
313
32
27
30
29
Dublin
293
34
32
23
30
Santiago
293
35
29
30
31
Bangkok
286
48
25
35
32
Central America
254
29
38
26
33
Bern
246
30
44
23
34
Beijing
246
47
28
43
35
Tijuana
240
38
42
22
36
Brussels
217
41
41
25
37
Stockholm
187
43
49
27
38
America/Denver
151
43
61
29
39
Warsaw
130
26
77
60
In [ ]:
Content source: hilljb/spark-jupyter
Similar notebooks: